In [1]:
import numpy as np
import pandas as pd
import os
import pickle

PATHS


In [2]:
# Root of the raw per-dataset recordings and the merged-output subdirectory.
# NOTE(review): hardcoded absolute Windows path — only runs on this machine.
DATA_DIRECTORY = "Q:\\p_eaglesense\\eaglesense\\data\\topviewkinect"
PREPROCESSED_DIRECTORY = DATA_DIRECTORY + "\\all"
# exist_ok=True makes the existence check unnecessary
os.makedirs(PREPROCESSED_DIRECTORY, exist_ok=True)

In [3]:
FEATURE_SET = "test"

DATA SANITY CHECK


In [4]:
# Verify every kept dataset: no unlabelled frames (-1) and only one tracked
# person (skeleton_id other than 0 would indicate a second skeleton).
for dataset_id in next(os.walk(DATA_DIRECTORY))[1]:
    # Keep only numeric dataset folders not starting with 1/3/4/6/7
    if not dataset_id.isdigit() or dataset_id.startswith(("1", "3", "4", "6", "7")):
        continue

    features_df = pd.read_csv("{root}/{dataset}/features.csv".format(root=DATA_DIRECTORY, dataset=dataset_id))
    labels_df = pd.read_csv("{root}/{dataset}/labels.csv".format(root=DATA_DIRECTORY, dataset=dataset_id))

    # -1 marks frames that were never annotated
    if (labels_df["activity"] == -1).any():
        print(dataset_id, "Missing labels")

    # skeleton_id 1 would mean a second tracked person in the labels
    if (labels_df["skeleton_id"] == 1).any():
        print(dataset_id, "Multiple people labels")

    # ... or in the extracted features
    if (features_df["skeleton_id"] == 1).any():
        print(dataset_id, "Multiple people features")

PEOPLE DETECTION ACCURACY


In [21]:
# Frame counters accumulated by the processing loop in a later cell:
# empty scenes, person-present frames, detected frames, tracked frames.
num_empty_labels = num_nonempty_labels = num_nonempty_detected = num_nonempty_tracked = 0

PROCESSING TIME


In [22]:
# Per-frame timing samples gathered across all kept datasets
features_time, total_time = [], []

In [23]:
# Accumulate detection counts and per-frame processing times over all datasets.
for dataset_id in next(os.walk(DATA_DIRECTORY))[1]:
    # Same dataset filter as the sanity check above
    if not dataset_id.isdigit() or dataset_id.startswith(("1", "3", "4", "6", "7")):
        continue

    dataset_root = "{root}/{dataset}".format(root=DATA_DIRECTORY, dataset=dataset_id)
    features_df = pd.read_csv(dataset_root + "/features.csv")
    labels_df = pd.read_csv(dataset_root + "/labels.csv")

    # Frames with a person present (activity 6 marks an empty scene)
    nonempty_labels_df = labels_df.loc[labels_df["activity"] != 6]
    # ... of those, frames where a skeleton was detected
    nonempty_detected_labels_df = nonempty_labels_df.loc[nonempty_labels_df["skeleton_id"] >= 0]
    nonempty_detected_frame_indices = nonempty_detected_labels_df["frame_id"].values

    # ... of those, frames for which feature rows were actually extracted
    activity_tracked_features_df = features_df.loc[features_df["frame_id"].isin(nonempty_detected_frame_indices)]

    num_empty_labels += len(labels_df) - len(nonempty_labels_df)
    num_nonempty_labels += len(nonempty_labels_df)
    num_nonempty_detected += len(nonempty_detected_labels_df)
    num_nonempty_tracked += len(activity_tracked_features_df)

    # Per-frame processing times, restricted to the detected frames
    processing_df = pd.read_csv(dataset_root + "/processing.csv")
    processing_df = processing_df.loc[processing_df["frame_id"].isin(nonempty_detected_frame_indices)]

    features_time.extend(processing_df["features_time"].values)
    total_time.extend(processing_df["total_time"].values)

In [28]:
# Mean per-frame feature-extraction time (units as recorded in processing.csv)
avg_features_time = np.mean(features_time)
avg_features_time


Out[28]:
2.1217542584129623

In [29]:
# Standard deviation of the per-frame feature-extraction time
std_features_time = np.std(features_time)
std_features_time


Out[29]:
0.41300443098096312

In [30]:
# Mean total per-frame processing time
avg_total_time = np.mean(total_time)
avg_total_time


Out[30]:
11.164455234732031

In [31]:
# Standard deviation of the total per-frame processing time
std_total_time = np.std(total_time)
std_total_time


Out[31]:
1.4047252699354558

In [37]:
num_empty_labels


Out[37]:
5232

In [38]:
num_nonempty_labels


Out[38]:
79760

In [39]:
# All labelled frames across the kept datasets
total_frames = num_empty_labels + num_nonempty_labels
total_frames


Out[39]:
84992

In [40]:
num_nonempty_labels / total_frames


Out[40]:
0.938441265060241

In [41]:
# Fraction of person-present frames in which a skeleton was detected
people_detection_accuracy = num_nonempty_detected / num_nonempty_labels
people_detection_accuracy


Out[41]:
0.9999122367101304

In [20]:
num_nonempty_detected


Out[20]:
262155

In [43]:
num_nonempty_tracked


Out[43]:
77024

In [44]:
num_nonempty_tracked/ total_frames


Out[44]:
0.90625

In [45]:
features_time / num_nonempty_tracked


Out[45]:
2.1217542584129623

In [46]:
total_time / num_nonempty_tracked


Out[46]:
11.164455234732031

Features and Labels


In [7]:
# Per-frame metadata columns excluded from the feature vector
ignored_features_columns = ["frame_id", "skeleton_id", "x", "y", "z"]
ignored_features_columns


Out[7]:
['frame_id', 'skeleton_id', 'x', 'y', 'z']

In [8]:
ignored_labels_columns = ["frame_id", "skeleton_id"]

In [ ]:
# Merge per-dataset features/labels into single CSVs for the selected feature set.
all_features_csv = "{root}/{tag}_features.csv".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
all_labels_csv = "{root}/{tag}_labels.csv".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)

# "w" mode truncates on open, so the previous truncate-then-append dance
# (open(...,"w").close() followed by "a") is unnecessary.
header = True

with open(all_features_csv, "w") as features_f, open(all_labels_csv, "w") as labels_f:
    for dataset_id in next(os.walk(DATA_DIRECTORY))[1]:
        # Same dataset filter used throughout the notebook
        if not dataset_id.isdigit():
            continue
        elif dataset_id.startswith(("1", "3", "4", "6", "7")):
            continue
        else:
            print(dataset_id, "... ", end="")

        labels_csv = "{root}/{dataset}/labels.csv".format(root=DATA_DIRECTORY, dataset=dataset_id)
        labels_df = pd.read_csv(labels_csv)
        features_csv = "{root}/{dataset}/features.csv".format(root=DATA_DIRECTORY, dataset=dataset_id)
        features_df = pd.read_csv(features_csv, low_memory=False)

        # Keep frames with a detected skeleton and a non-empty activity label
        evaluation_labels_df = labels_df.loc[labels_df["skeleton_id"] >= 0]
        evaluation_labels_df = evaluation_labels_df.loc[evaluation_labels_df["activity"] != 6]
        evaluation_frame_indices = evaluation_labels_df["frame_id"].values

        # Restrict features to frames the tracker produced rows for, then
        # restrict labels to exactly those frames so both tables stay aligned.
        # .copy() avoids pandas SettingWithCopyWarning when "subject" is added.
        evaluation_features_df = features_df.loc[features_df["frame_id"].isin(evaluation_frame_indices)].copy()
        final_frame_indices = evaluation_features_df["frame_id"].values

        evaluation_labels_df = evaluation_labels_df.loc[evaluation_labels_df["frame_id"].isin(final_frame_indices)].copy()
        evaluation_labels_df = evaluation_labels_df.drop(labels=ignored_labels_columns, axis=1)
        evaluation_labels_df["subject"] = int(dataset_id)
        evaluation_labels_df.to_csv(labels_f, header=header, index=False)

        evaluation_features_df = evaluation_features_df.drop(labels=ignored_features_columns, axis=1)
        evaluation_features_df["subject"] = int(dataset_id)
        evaluation_features_df = evaluation_features_df.astype("float64")
        evaluation_features_df.to_csv(features_f, header=header, index=False)

        # Only the first dataset writes the CSV header row
        header = False

OVERVIEW


In [9]:
all_features_df = pd.read_csv(all_features_csv)

In [10]:
all_features_df.shape


Out[10]:
(77024, 73)

In [11]:
all_features_df.head()


Out[11]:
layer_area_0 layer_area_1 layer_area_2 layer_contours_0 layer_contours_1 layer_distance_0 layer_distance_1 layer_distance_2 layer_distance_3 layer_distance_4 ... interlayer_pos_16 interlayer_pos_17 extremities0 extreme_infrared_0 extreme_infrared_1 extreme_infrared_2 extreme_infrared_3 extreme_infrared_4 extreme_infrared_5 subject
0 0.297578 0.411765 0.290657 3.0 3.0 16.5529 26.6833 26.0192 26.6833 201.0 ... -26.0 -107.0 4.0 0.0 10.0 11.5 11.5 0.0 11.5 2001.0
1 0.310345 0.419238 0.270417 3.0 3.0 16.4012 26.4764 26.0192 26.4764 191.5 ... -26.0 -105.0 5.0 0.5 9.0 11.0 1.0 0.5 11.0 2001.0
2 0.318015 0.386029 0.295956 3.0 3.0 16.1245 26.2488 27.0185 26.2488 174.5 ... -26.0 -104.0 5.0 0.0 12.5 4.5 4.5 0.5 13.0 2001.0
3 0.348399 0.384181 0.267420 3.0 3.0 16.4012 26.4197 26.4764 26.4197 164.0 ... -25.0 -103.0 5.0 0.0 6.0 4.5 0.0 0.0 7.0 2001.0
4 0.356383 0.370567 0.273050 3.0 3.0 17.7200 27.4591 27.4591 27.4591 164.5 ... -26.0 -107.0 3.0 0.0 0.0 0.5 0.0 0.0 0.5 2001.0

5 rows × 73 columns


In [12]:
all_labels_df = pd.read_csv(all_labels_csv)

In [13]:
all_labels_df.shape


Out[13]:
(77024, 4)

In [14]:
all_labels_df.head()


Out[14]:
activity orientation orientation_accurate subject
0 0 130.0 -1 2001
1 0 130.0 -1 2001
2 0 120.0 -1 2001
3 0 130.0 -1 2001
4 0 150.0 -1 2001

In [24]:
np.unique(all_labels_df["activity"])


Out[24]:
array([0, 1, 2, 3, 4, 5], dtype=int64)

In [25]:
# Human-readable activity names, indexed by class id 0-5
ACTIVITIES = ["Standing", "Sitting", "Pointing", "Phone", "Tablet", "Paper"]
num_activities = len(ACTIVITIES)
num_activities


Out[25]:
6

In [26]:
# Subject ids present in the merged data (float dtype because the merged
# feature CSV was written as float64)
unique_subjects = all_features_df["subject"].unique()
unique_subjects


Out[26]:
array([ 2001.,  2002.,  2003.,  2004.,  2005.,  2006.,  2007.,  2008.,
        2009.,  2010.,  2011.,  2012.])

In [27]:
# Number of distinct subjects
num_subjects = len(unique_subjects)
num_subjects


Out[27]:
12

In [28]:
# Feature dimensionality, excluding the "subject" bookkeeping column
feature_vector = all_features_df.drop(["subject"], axis=1)
num_features = feature_vector.shape[1]
num_features


Out[28]:
72

DATA


In [29]:
# Output paths for the pickled train/test splits:
# s1/s2 = stratified samples tests, cs = cross-subject, noinfrared = cs without infrared features
s1_data_path = "{root}/{tag}_s1_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
s2_data_path = "{root}/{tag}_s2_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
cs_data_path = "{root}/{tag}_cs_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
noinfrared_data_path = "{root}/{tag}_cs_noinfrared_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)

SAMPLES TESTS


In [30]:
def samples_test_split(features_df, labels_df, train_test_ratio, seed):
    """Stratified per-subject, per-activity train/test split.

    Parameters
    ----------
    features_df : DataFrame
        One row per frame; must include a "subject" column.
    labels_df : DataFrame
        Aligned with features_df; must include "subject" and "activity" columns.
    train_test_ratio : float
        Fraction of each (subject, activity) group assigned to training.
    seed : int
        random_state for reproducible sampling.

    Returns
    -------
    (X_train, y_train, X_test, y_test) as numpy arrays.

    NOTE: relies on the notebook-level globals num_subjects, num_activities,
    unique_subjects and num_features defined in earlier cells.
    """
    # Training-set size for every (subject, activity) pair
    training_sizes_per_subject = np.zeros((num_subjects, num_activities), dtype=np.int64)
    for subject_idx, subject_id in enumerate(unique_subjects):
        subject_activities = labels_df[labels_df["subject"] == subject_id]["activity"].values
        subject_activities_bin = np.bincount(np.squeeze(subject_activities))
        training_sizes_per_subject[subject_idx] = np.array(
            [int(size * train_test_ratio) for size in subject_activities_bin])

    X_train = np.array([], dtype=np.float64).reshape(0, num_features)
    y_train = np.array([], dtype=np.int32).reshape(0, 1)
    X_test = np.array([], dtype=np.float64).reshape(0, num_features)
    y_test = np.array([], dtype=np.int32).reshape(0, 1)

    # Stratified sampling within each subject
    for subject_idx, subject_id in enumerate(unique_subjects):
        subject_features = features_df[features_df["subject"] == subject_id].drop(["subject"], axis=1)
        subject_labels = labels_df[labels_df["subject"] == subject_id][["activity"]]

        for activity_idx in range(num_activities):
            num_activity_samples = training_sizes_per_subject[subject_idx, activity_idx]
            activity_labels_df = subject_labels[subject_labels["activity"] == activity_idx]
            activity_train_labels_df = activity_labels_df.sample(
                n=num_activity_samples, replace=False, random_state=seed)

            activity_train_indices = list(activity_train_labels_df.index.values)
            # Set membership keeps the complement computation O(n), not O(n^2)
            train_index_set = set(activity_train_indices)
            activity_test_indices = [idx for idx in activity_labels_df.index.values
                                     if idx not in train_index_set]

            # FIX: .loc replaces DataFrame.ix, which was deprecated in pandas
            # 0.20 and removed in pandas 1.0
            activity_X_train = subject_features.loc[activity_train_indices]
            activity_y_train = subject_labels.loc[activity_train_indices]
            activity_X_test = subject_features.loc[activity_test_indices]
            activity_y_test = subject_labels.loc[activity_test_indices]

            X_train = np.vstack([X_train, activity_X_train.values])
            y_train = np.vstack([y_train, activity_y_train.values])
            X_test = np.vstack([X_test, activity_X_test.values])
            y_test = np.vstack([y_test, activity_y_test.values])

    return X_train, y_train, X_test, y_test

SAMPLES TEST 1


In [31]:
s1_X_train, s1_y_train, s1_X_test, s1_y_test = samples_test_split(all_features_df, all_labels_df, 1/3, seed=42)

In [32]:
s1_X_train.shape


Out[32]:
(25653, 72)

In [33]:
s1_X_test.shape


Out[33]:
(51371, 72)

In [34]:
# Persist the 1/3-training split for downstream model training
s1_data = dict(
    X_train=s1_X_train,
    y_train=s1_y_train,
    X_test=s1_X_test,
    y_test=s1_y_test,
)
with open(s1_data_path, "wb") as f:
    pickle.dump(s1_data, f)

SAMPLES TEST 2


In [35]:
s2_X_train, s2_y_train, s2_X_test, s2_y_test = samples_test_split(all_features_df, all_labels_df, 2/3, seed=42)

In [36]:
s2_X_train.shape


Out[36]:
(51324, 72)

In [37]:
s2_X_test.shape


Out[37]:
(25700, 72)

In [38]:
# Persist the 2/3-training split
s2_data = dict(
    X_train=s2_X_train,
    y_train=s2_y_train,
    X_test=s2_X_test,
    y_test=s2_y_test,
)
with open(s2_data_path, "wb") as f:
    pickle.dump(s2_data, f)

CROSS SUBJECT TESTS


In [39]:
def crosssubject_test_split(features_df, labels_df, training_subjects_ids):
    """Split frames by subject id.

    Frames of subjects listed in `training_subjects_ids` form the training
    set; frames of every other subject form the test set.

    Returns (X_train, y_train, X_test, y_test) as numpy arrays.

    NOTE: iterates over the notebook-level global `unique_subjects`.
    """
    # Feature dimensionality: every column except "subject"
    num_features = features_df.shape[1] - 1

    X_train = np.array([], dtype=np.float64).reshape(0, num_features)
    y_train = np.array([], dtype=np.int32).reshape(0, 1)
    X_test = np.array([], dtype=np.float64).reshape(0, num_features)
    y_test = np.array([], dtype=np.int32).reshape(0, 1)

    for subject_id in unique_subjects:
        subject_X = (features_df[features_df["subject"] == subject_id]
                     .drop(["subject"], axis=1)
                     .values)
        subject_y = labels_df[labels_df["subject"] == subject_id][["activity"]].values

        # Route this subject's frames to either the train or the test set
        if subject_id in training_subjects_ids:
            X_train = np.vstack((X_train, subject_X))
            y_train = np.vstack((y_train, subject_y))
        else:
            X_test = np.vstack((X_test, subject_X))
            y_test = np.vstack((y_test, subject_y))

    return X_train, y_train, X_test, y_test

CROSS-SUBJECT 1


In [40]:
CS_TRAIN_SUBJECTS_ID = [2001, 2003, 2005, 2007, 2009, 2011]

In [41]:
cs_X_train, cs_y_train, cs_X_test, cs_y_test = crosssubject_test_split(all_features_df, all_labels_df, CS_TRAIN_SUBJECTS_ID)

In [42]:
cs_X_train.shape


Out[42]:
(34945, 72)

In [43]:
cs_X_test.shape


Out[43]:
(42079, 72)

In [44]:
# Persist the cross-subject split
cs_data = dict(
    X_train=cs_X_train,
    y_train=cs_y_train,
    X_test=cs_X_test,
    y_test=cs_y_test,
)
with open(cs_data_path, "wb") as f:
    pickle.dump(cs_data, f)

NO-INFRARED


In [45]:
# Feature columns with the infrared descriptors removed
noinfrared_features_cols = [
    feature for feature in all_features_df.columns
    if not feature.startswith("extreme_infrared_")
]

In [46]:
noinfrared_features_df = all_features_df[noinfrared_features_cols]

In [47]:
# Feature dimensionality without infrared columns (and minus "subject")
num_noinfrared_features = noinfrared_features_df.shape[1] - 1
num_noinfrared_features


Out[47]:
66

In [48]:
# Cross-subject split on the infrared-free feature set
noinfrared_X_train, noinfrared_y_train, noinfrared_X_test, noinfrared_y_test = crosssubject_test_split(
    noinfrared_features_df, all_labels_df, CS_TRAIN_SUBJECTS_ID)

In [49]:
noinfrared_X_train.shape


Out[49]:
(34945, 66)

In [50]:
noinfrared_X_test.shape


Out[50]:
(42079, 66)

In [51]:
# Persist the infrared-free cross-subject split
noinfrared_cs_data = dict(
    X_train=noinfrared_X_train,
    y_train=noinfrared_y_train,
    X_test=noinfrared_X_test,
    y_test=noinfrared_y_test,
)
with open(noinfrared_data_path, "wb") as f:
    pickle.dump(noinfrared_cs_data, f)